# Libraries
library(here)
library(lubridate)
library(meta)
library(knitr)
library(rsq)
library(car)
library(vroom)
library(pastecs)
library(qqman)
library(ggrepel)
library(kableExtra)
library(janitor)
library(MASS)
library(viridis)
library(sciplot)
library(ROCR)
library(tidyverse)
theme_set(theme_bw())
# Functions
#source(here("Script/Functions", ""))
We first ran a genome-wide association study in the UK Biobank European cohort with gout as the outcome. This was done with a total of 27,287,012 variants (after imputation), and adjusted for age, sex, and the first 40 principal components. Below, we did the following to clean up this summary stats file:
We read in the summary statistics.
We then did some initial filtering to remove X and Y chromosome SNPs, and some of the indels.
We removed variants that were not in the imputed CoreExome genotype data.
We then filtered out multi-allelic variants and variants with MAF < 0.01.
For the Polynesian analysis, we additionally filtered out variants that were not directly genotyped in the CoreExome.
# Reading in summary stats
if(file.exists(here("Output/Temp/sumstat5.RData"))) {
#load(here("Output/Temp/sumstat5.RData"))
} else {
if(file.exists(here("Output/Temp/sumstat3.RData"))) {
load(here("Output/Temp/sumstat3.RData"))
} else {
sumstat <- vroom(here("Data/GWAS/ukbb_gout-allcontrol_chr1-22.X.XY.add_unfiltered_p.tsv"), delim = "\t", col_names = TRUE) # start with 27,287,012 variants in the summary stats
sumstat2 <- sumstat %>%
filter(str_length(A1) == 1,
CHR %in% 1:22) # Removing 738,219 indels and keeping only autosomes (removes a further 1,069,966 variants) = 25,478,827 variants
sumstat2_1 <- sumstat2 %>%
mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,1],
SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,2]) # this takes forever to run
# test <- sumstat2_1 %>%
# filter(str_detect(SNP2, ",")) %>%
# separate(SNP2, into = c("SNP2", "extra"), sep = ",") # 1,696 variants had an extra comma in their ID
#
# test1 <- test %>%
# mutate(extra = as.numeric(extra)) %>%
# filter(extra != CHR) # All just had their chromosome attached, can just remove the extra bit
sumstat2_2 <- sumstat2 %>%
mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,1],
SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,2])
# test <- sumstat2_2 %>% filter(str_detect(SNP2, ",")) # none with ,
test1 <- sumstat2_2 %>%
filter(str_detect(SNP1, regex("^rs[0-9]+"))) # 24,248,522 variants have an rsID in SNP1 column
# test1_1 <- test1 %>%
# filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 633,645 of these have an rsID in SNP2 column
#
# test1_1_1 <- test1_1 %>%
# filter(SNP1 != SNP2) # 1,367 variants have two rsIDs, for the most part SNP1 appears to be the newest rsID, but I might want to keep the extra rsID in a separate column
test1_2 <- test1 %>%
filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 23,610,697 variants have rsID in SNP1 column and chr:bp_a1_a2 in SNP2 column
test1_2_1 <- test1_2 %>%
mutate(CHR2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
BP2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
Allele1 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,2],
Allele2 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,3])
# tmp <- test1_2_1 %>%
# filter(BP != BP2) # all BPs are equal
#
# tmp <- test1_2_1 %>%
# filter(CHR != CHR2) # all CHRs are equal
#
# tmp <- test1_2_1 %>%
# filter(A1 != Allele2) # Allele2 is not always A1
test1_2_2 <- test1_2_1 %>%
select(-CHR2, -BP2) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1) # removes a further 219,771 indels = 23,390,926 SNPs
test1_2_final <- test1_2_2 %>%
select(SNP, Allele1, Allele2)
# test1_3 <- test1 %>%
# filter(!str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$|^rs[0-9]+$"))) # 4,180 variants have neither rsID nor chr:bp_a1_a2 in SNP2 column
#
# test1_4 <- test1_3 %>%
# filter(!str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All of these are in the format Affx-<number>
test1_final <- test1 %>%
mutate(RSID = SNP1,
ALT_RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) & SNP1 != SNP2 ~ SNP2, TRUE ~ NA_character_),
AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
SNP_ID = case_when(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP2, TRUE ~ NA_character_)) %>%
left_join(test1_2_final, by = "SNP")
test1_final2 <- test1_final %>%
filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
# nrow(test1_final2) - nrow(test1_final) # successfully removed indels from final dataset
# sum(!is.na(test1_final2$RSID))
# sum(!is.na(test1_final2$ALT_RSID))
# sum(!is.na(test1_final2$AFFYID))
# sum(!is.na(test1_final2$SNP_ID))
# sum(!is.na(test1_final2$Allele1))
# sum(!is.na(test1_final2$Allele2))
test2 <- sumstat2_2 %>%
filter(!str_detect(SNP1, regex("^rs[0-9]+"))) # 1,230,305 variants don't have an rsID in SNP1 column
test2_1 <- test2 %>%
filter(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,053 have the SNP_ID format in SNP1
test2_1_1 <- test2_1 %>%
mutate(CHR2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
BP2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
Allele1 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,2],
Allele2 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,3])
# tmp <- test2_1_1 %>%
# filter(BP != BP2) # all BPs are equal
#
# tmp <- test2_1_1 %>%
# filter(CHR != CHR2) # all CHRs are equal
test2_1_2 <- test2_1_1 %>%
select(-CHR2, -BP2) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1)
# nrow(test2_1_2) - nrow(test2_1_1) # removed 975,583
test2_1_final <- test2_1_2 %>%
select(SNP, Allele1, Allele2)
# test2_1_1 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 23 of these have RSID in SNP2
# test2_1_2 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,029 of these have SNP_ID in SNP2
#
# test2_1_2_1 <- test2_1_2 %>%
# filter(SNP1 != SNP2) # All SNP_ID columns identical in these individuals
# test2_1_3 <- test2_1 %>%
# filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # 1 has AffyID in SNP2
# test2_2 <- test2 %>%
# filter(str_detect(SNP1, regex("^Affx-[0-9]+$"))) # 252 have an AffyID in SNP1
#
# test2_2_1 <- test2_2 %>%
# filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All have AffyID in SNP2
#
# test2_2_1_1 <- test2_2_1 %>%
# filter(SNP1 != SNP2) # All AffyID columns identical in these individuals
test2_final <- test2 %>%
mutate(RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
ALT_RSID = NA_character_,
AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
SNP_ID = case_when(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP1, TRUE ~ NA_character_)) %>%
left_join(test2_1_final, by = "SNP")
test2_final2 <- test2_final %>%
filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
sumstat3 <- rbind(test1_final2, test2_final2) %>% arrange(CHR, BP)
save(sumstat3, file = here("Output/Temp/sumstat3.RData"))
}
# tmp <- sumstat3 %>%
# select(CHR, BP) %>%
# unique()
# for(i in 1:22) {
# write_delim(select(filter(tmp, CHR == i), BP), file = paste0(here("Output/Temp/"), "chr", i, "_snplist.txt"), delim = "\n")
# }
# Now we will work out which of these SNPs are present in the Imputed CoreEx data
#system(paste0('source ~/.bashrc; parallel "zcat /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Imputed_Genotypes/QC1-10_Impute_EUR_only/CZ-MB1.2-QC1.10_EUR_imputed_chr{}.vcf.gz | grep -v ', "'#'",' | cut -f2 | grep -Fwf ', here("Output/Temp/"),'chr{}_snplist.txt > ', here("Output/Temp/"),'matched_snps_chr{}.txt" ::: {1..22}'))
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("matched_snps_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
out <- out %>%
select(CHR, X1) %>%
mutate(C_B = paste0(CHR, "_", X1))
sumstat4 <- sumstat3 %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(C_B %in% out$C_B)
rm(sumstat3)
# Now to pull out the MAF for all SNPs
# Already have CHR and BP for all SNPs separated by chr in matched_snps_chr* and mfi files are per chr so can simply match location in grep
#system(paste0('source ~/.bashrc; parallel "grep -Fwhf ', here("Output/Temp/"), 'matched_snps_chr{}.txt /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/splits/ukb_mfi_chr{}_v3.txt > ', here("Output/Temp/"), 'ukb_maf_info_chr{}.txt" ::: {1..22}'))
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
# test <- out %>%
# filter(is.na(Allele1) | is.na(Allele2))
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat4$C_B) %>%
arrange(CHR, BP)
sumstat5 <- sumstat4 %>%
semi_join(out2, by = c("CHR", "BP"))
save(sumstat5, file = here("Output/Temp", "sumstat5.RData"))
}
# Pulling out biallelic SNPs only and adding MAF and INFO columns, also making sure MAF etc are correct for allele1/2
if(file.exists(here("Output/Temp/biallelic_sumstat_final.RData"))) {
# load(here("Output/Temp/biallelic_sumstat_final.RData"))
# load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
} else {
tmp <- sumstat5 %>%
filter(duplicated(C_B))
sumstat5_biallelic <- sumstat5 %>%
filter(!(C_B %in% tmp$C_B))
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat5_biallelic$C_B) %>%
arrange(CHR, BP)
tmp <- out2 %>%
filter(duplicated(C_B))
out2 <- out2 %>%
filter(!(C_B %in% tmp$C_B))
sumstat5_biallelic <- sumstat5_biallelic %>%
left_join(out2, by = c("CHR", "BP"))
sumstat5_biallelic <- sumstat5_biallelic %>%
semi_join(out2, by = c("CHR", "BP"))
# test <- sumstat5_biallelic %>%
# filter(str_length(Allele1.x) == 1,
# str_length(Allele1.y) == 1)
#
# test2 <- sumstat5_biallelic %>%
# filter(!(C_B.x %in% test$C_B.x))
#
# test3 <- test2 %>%
# filter(!is.na(Allele1.x)) # This just removed all of the SNPs without SNP_ID column
# sum(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # only 5 SNPs don't match by name
test <- sumstat5_biallelic %>%
filter(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # 4 are multiallelic, one is an indel (looked it up in dbSNP)
extra_multiallelic1 <- test %>%
slice(-2)
#sum(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y, na.rm = T) # 878 don't match
test <- sumstat5_biallelic %>%
filter(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y) # these are all mismatched alleles (i.e. multi-allellic)
extra_multiallelic2 <- sumstat5_biallelic %>%
filter((Allele1.x == Allele1.y & Allele2.x != Allele2.y) | (Allele1.x != Allele1.y & Allele2.x == Allele2.y))
sumstat5_biallelic1 <- sumstat5_biallelic %>%
filter(is.na(Allele1.x),
!(SNP %in% extra_multiallelic2$SNP)) %>%
rename(Allele1 = Allele1.y,
Allele2 = Allele2.y,
Effect_Allele = A1) %>%
select(-Allele1.x, -Allele2.x)
sumstat5_biallelic2 <- sumstat5_biallelic %>%
filter(Allele1.x == Allele1.y,
Allele2.x == Allele2.y,
!(SNP %in% extra_multiallelic2$SNP)) %>%
rename(Allele1 = Allele1.x,
Allele2 = Allele2.x,
Effect_Allele = A1) %>%
select(-Allele1.y, -Allele2.y)
# test <- sumstat5_biallelic %>%
# filter(!(SNP %in% sumstat5_biallelic1$SNP) & !(SNP %in% sumstat5_biallelic2$SNP))
sumstat5_biallelic3 <- sumstat5_biallelic2 %>%
select(CHR:SNP_ID, C_B.x:SNP2.y, Allele1, Allele2, MAF:C_B.y) %>%
rbind(sumstat5_biallelic1) %>%
arrange(CHR, BP)
test <- sumstat5_biallelic3 %>%
filter(Minor_Allele != Effect_Allele)
#summary(test$MAF) # all really close to 0.5 MAF, just need to flip OR, L95, and U95 then set Effect_Allele to Minor_Allele column
test <- test %>%
mutate(OR = 1/OR,
tmp = 1/L95,
tmp2 = 1/U95,
L95 = tmp2,
U95 = tmp,
Effect_Allele = Minor_Allele) %>%
rename(EAF = MAF) %>%
select(-tmp, -tmp2, -Minor_Allele)
sumstat5_biallelic4 <- sumstat5_biallelic3 %>%
filter(Minor_Allele == Effect_Allele) %>%
select(-Minor_Allele) %>%
rename(EAF = MAF) %>%
rbind(test) %>%
arrange(CHR, BP)
test <- sumstat5_biallelic4 %>%
filter(Allele2 == Effect_Allele) %>%
rename(Alternate_Allele = Allele1) %>%
select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
test2 <- sumstat5_biallelic4 %>%
filter(Allele1 == Effect_Allele) %>%
rename(Alternate_Allele = Allele2) %>%
select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
sumstat5_biallelic5 <- rbind(test, test2) %>%
arrange(CHR, BP)
biallelic_sumstat_final <- sumstat5_biallelic5
# tmp <- biallelic_sumstat_final %>%
# filter(C_B.x != C_B.y)
#
# tmp <- biallelic_sumstat_final %>%
# filter(SNP1.x != SNP2.y) # Only non-matching site was an indel (can remove)
#
# tmp <- biallelic_sumstat_final %>%
# filter(SNP2.x != SNP1.y) # This includes the non-matching site from above and every other site is just wrong in the SNP1.y column (but they are the same SNP) => keep SNP2.x
biallelic_sumstat_final <- biallelic_sumstat_final %>%
filter(SNP1.x == SNP2.y) %>%
select(-C_B.x, -C_B.y, -SNP2.y, -SNP1.y) %>%
rename(SNP1 = SNP1.x,
SNP2 = SNP2.x)
# Removing variants that aren't in the directly genotyped CoreEx data (for Polynesian analysis)
# snplist_plink2 <- biallelic_sumstat_final %>%
# mutate(BP2 = BP) %>%
# select(CHR, BP, BP2, SNP)
#write_delim(snplist_plink2, delim = " ", file = here("Output/Temp/snplist_plink2.txt"), col_names = F)
# First filtering CoreEx files to only include individuals of interest and SNPs with a callrate of 95%
CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t")
All_Euro_ID <- read_delim(here("Output/Temp/merged_PRS_UKBB.fam"), delim = " ", col_names = F)
CoreExPheno_Euro <- CoreExPheno %>%
filter(Geno.BroadAncestry == "European",
Geno.SampleID %in% All_Euro_ID$X2,
General.Use != "No",
!(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))
All_CoreEx_ID <- read_delim("/Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam", delim = " ", col_names = F)
CoreExPheno_Poly <- CoreExPheno %>%
filter(Geno.BroadAncestry == "Oceanian",
Geno.SampleID %in% All_CoreEx_ID$X2,
General.Use != "No",
!(Pheno.Study %in% c("ESR", "Pacific Trust")),
!is.na(Pheno.GoutSummary))
rm(CoreExPheno, All_CoreEx_ID, All_Euro_ID)
all_coreex_ids <- rbind(CoreExPheno_Euro, CoreExPheno_Poly) %>%
select(Geno.FamilyID, Geno.SampleID)
#write_delim(all_coreex_ids, delim = "\t", file = here("Output/Temp/all_coreex_ids.txt"), col_names = F)
#system(paste0("source ~/.bashrc; plink1.9b4.9 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --keep ", here("Output/Temp/all_coreex_ids.txt"), " --extract range ", here("Output/Temp/snplist_plink2.txt"), " --geno 0.05 --make-bed --out ", here("Output/Temp/inCoreExGeno")))
geno <- read_delim(here("Output/Temp", "inCoreExGeno.bim"), delim = "\t", col_names = FALSE) %>%
mutate(CHR_BP = paste0(X1, "_", X4))
biallelic_sumstat_final_poly <- biallelic_sumstat_final %>%
mutate(CHR_BP = paste0(CHR, "_", BP)) %>%
filter(CHR_BP %in% geno$CHR_BP)
rm(geno, snplist_plink2)
save(biallelic_sumstat_final, file = here("Output/Temp", "biallelic_sumstat_final.RData"))
save(biallelic_sumstat_final_poly, file = here("Output/Temp", "biallelic_sumstat_final_poly.RData"))
}
# Dealing with multi-allelic SNPs (leave for later)
if(file.exists(here("Output/Temp/multi_allelic_for_later.RData"))) {
#load(here("Output/Temp/multi_allelic_for_later.RData"))
} else {
tmp <- biallelic_sumstat_final %>%
mutate(C_B = paste0(CHR, "_", BP))
sumstat5_multi <- sumstat5 %>%
filter(!(C_B %in% tmp$C_B)) # 38,991 rows contain multi-allelic sites
out <- tibble()
for(i in 1:22) {
assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
out <- rbind(out, get(paste0("chr", i, "_snps")))
rm(list = paste0("chr", i, "_snps"), i)
}
colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
out <- out %>%
mutate(C_B = paste0(CHR, "_", BP)) %>%
filter(str_length(Allele1) == 1,
str_length(Allele2) == 1,
MAF > 0.01,
MAF < 0.99,
INFO > 0.3)
out2 <- out %>%
filter(C_B %in% sumstat5_multi$C_B) %>%
arrange(CHR, BP) # 26,235 of the 38,991 chr/bp locations match
sumstat5_multi <- sumstat5_multi %>%
left_join(out2, by = c("CHR", "BP"))
# tmp <- sumstat5_multi %>%
# unique() # all unique
# tmp <- sumstat5_multi %>%
# filter(duplicated(C_B.x))
#
# test <- sumstat5_multi %>%
# filter(C_B.x %in% tmp$C_B.x) # 27,909 rows contain duplicated C_B.x
save(sumstat5_multi, file = here("Output/Temp", "multi_allelic_for_later.RData"))
}
To get from these cleaned up summary statistics to the final list of SNPs for the European PRS, we did the following:
We filtered out all SNPs with P-values greater than 5e-8.
We took each lead SNP within a 1 Mb window and used these to define 23 crude loci.
This list of lead SNPs was further filtered to only include one lead SNP per full locus.
Next, SNPs in the UK Biobank BGEN files were extracted if they fit within the boundaries of these “full loci”.
Conditional GWAS were run at each locus, conditioning on the lead SNP.
If there was a significant SNP (P < 5e-8) remaining after conditioning, the original lead SNP and the new lead SNP were used for a subsequent conditional GWAS at this locus.
This was repeated until no more significant SNPs (P < 5e-8) remained at each locus.
Locus zooms were plotted for each locus, using both the unconditioned and conditioned GWAS results.
Finally, the resulting list of 27 lead SNPs were saved in a single file ready for conversion to a PRS.
# Defining one SNP per locus ---------------------------------------------------------------------------
# First, filter out P < 5e-8 SNPs and arrange by P
if(file.exists(here("Output/Temp/biallelic_signif.RData"))) {
load(here("Output/Temp/biallelic_signif.RData"))
} else {
biallelic_signif <- biallelic_sumstat_final %>%
filter(P <= 5e-8) %>%
arrange(P)
save(biallelic_signif, file = here("Output/Temp/biallelic_signif.RData"))
}
# Grouping into loci +- 500 kb of top SNPs
gout_top <- biallelic_signif %>%
slice(1)
gout2 <- biallelic_signif %>%
filter(!(CHR == gout_top$CHR[1] & BP %in% ((gout_top$BP[1] - 500000):(gout_top$BP[1] + 500000))))
while(nrow(gout2) > 0) {
tmp <- gout2 %>%
slice(1)
gout_top <- rbind(tmp, gout_top)
gout2 <- gout2 %>%
filter(!(CHR == gout_top$CHR[1] & BP %in% ((gout_top$BP[1] - 500000):(gout_top$BP[1] + 500000))))
}
gout_top <- gout_top %>%
arrange(CHR, BP)
# Finding regions of loci
biallelic_signif <- biallelic_signif %>%
arrange(CHR, BP)
out <- NA
for(i in 2:nrow(biallelic_signif)) {
if(biallelic_signif$CHR[i] == biallelic_signif$CHR[i - 1]){
out[i] <- biallelic_signif$BP[i] - biallelic_signif$BP[i - 1]
} else {
out[i] <- NA
}
}
tmp <- biallelic_signif %>%
mutate(Diff = out,
Diff2 = case_when(Diff < 500000 ~ Diff))
out <- biallelic_signif %>% slice(1)
for(i in 2:nrow(biallelic_signif)) {
if(is.na(tmp$Diff2[i])){
out <- rbind(out, biallelic_signif %>% slice(i - 1), biallelic_signif %>% slice(i))
}
}
out <- rbind(out, biallelic_signif %>% slice(nrow(biallelic_signif)))
# Extracting regions
bgen_ranges <- out %>% select(CHR, BP)
tmp1 <- bgen_ranges %>% slice(seq(1, nrow(bgen_ranges), by = 2)) %>% rename(BP1 = BP)
tmp2 <- bgen_ranges %>% slice(seq(2, nrow(bgen_ranges), by = 2)) %>% rename(CHR.x = CHR, BP2 = BP)
bgen_ranges <- tmp1 %>%
cbind(tmp2) %>%
mutate(BP1 = BP1 - 50000,
BP2 = BP2 + 50000) %>%
select(-CHR.x)
bgen_range1 <- bgen_ranges %>%
filter(CHR < 10) %>%
mutate(BGEN = paste0("0", CHR, ":", BP1, "-", BP2))
bgen_range2 <- bgen_ranges %>%
filter(CHR > 9) %>%
mutate(BGEN = paste0(CHR, ":", BP1, "-", BP2))
bgen_ranges <- rbind(bgen_range1, bgen_range2) %>%
arrange(CHR, BP1)
tmp <- bgen_ranges %>%
select(BGEN)
rm(bgen_range1, bgen_range2, tmp1, tmp2, out, gout2, i)
#write_delim(tmp, file = here("Output/Temp", "bgen_range.txt"), delim = "\n", col_names = F)
# Extracting all SNPs from biallelic sumstat that fit within boundaries of loci
if(file.exists(here("Output/Temp/biallelic_loci.RData"))) {
load(here("Output/Temp/biallelic_loci.RData"))
} else {
biallelic_loci <- tibble()
for(i in 1:nrow(bgen_ranges)){
tmp <- biallelic_sumstat_final %>%
filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i]))
biallelic_loci <- rbind(biallelic_loci, tmp)
}
biallelic_loci <- biallelic_loci %>%
mutate(SNP_ID2 = paste0(CHR, "_", BP, "_", Alternate_Allele, "_", Effect_Allele))
#save(biallelic_loci, file = here("Output/Temp/biallelic_loci.RData"))
}
tmp <- biallelic_loci %>%
mutate(BP2 = BP) %>%
select(CHR, BP, BP2, SNP)
#write_delim(tmp, file = here("Output/Temp", "biallelic_loci_snps.txt"), delim = "\t", col_names = F)
out <- c()
for(i in 1:nrow(bgen_ranges)){
tmp <- gout_top %>%
filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i])) %>%
arrange(P) %>%
slice(1)
out <- rbind(out, tmp)
}
gout_top <- out %>%
cbind(bgen_ranges %>% select(-CHR))
rm(tmp, bgen_ranges, out, i)
# Extracting all SNPs at loci from bgen files and converting to plink format -------------------------------
#system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "bgen_range.txt"), ' | bcftools reheader -h /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forclumping.vcf.gz" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forclumping.vcf.gz --extract range ', here("Output/Temp/biallelic_loci_snps.txt"), ' --pheno ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --pheno-name plink_goutaff --update-sex ', here("Data/GWAS", "gout_gwas_keep_ids_w_sex.txt"), ' --geno 0.1 --maf 0.01 --hwe 0.000001 --make-bed --out ', here("Output/Temp/"), 'chr{}_tmp" ::: ', paste(unique(gout_top$CHR), collapse = " ")))
# Reading the bim files into R and converting their identifiers to just the rsid
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "_tmp.bim")]
for(i in file_names){
assign(i, read_delim(paste0(here("Output/Temp/"), i), delim = "\t", col_names = F))
assign(i, get(i) %>% left_join(biallelic_loci, (by = c("X1" = "CHR", "X4" = "BP"))) %>% mutate(SNP_clean = case_when(is.na(RSID) ~ SNP_ID2, TRUE ~ RSID)))
# assign(paste0(i, "_notequal"), get(i) %>% filter(X2 != SNP)) # all identical
assign(paste0("new_", i), get(i) %>% select(X1, SNP_clean, X3:X6))
#write_delim(get(paste0("new_", i)), file = paste0(here("Output/Temp/"), i), delim = "\t", col_names = F)
}
rm(list = ls()[str_detect(ls(), ".bim")], i, file_names)
# Running the conditional GWAS ----------------------------------------
# Split up the plink files to have one locus per file (saves on computational time)
gout_top2 <- gout_top %>%
select(CHR, BP1, BP2, RSID)
for(i in 1:nrow(gout_top2)){
tmp <- gout_top2 %>% slice(i)
#write_delim(tmp, file = paste0(here("Output/Temp/"), "extractrange_", tmp$RSID, ".txt"), delim = "\t", col_names = F)
}
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile {1}/Output/Temp/chr{2}_tmp --extract range {1}/Output/Temp/extractrange_{3}.txt --make-bed --out {1}/Output/Temp/{3}" ::: ', paste(rep(here(), nrow(gout_top)), collapse = " "), ' ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "cat {1}/Output/Temp/{2}.bim | cut -f 2 > {1}/Output/Temp/{2}_snps; split -d -n l/10 {1}/Output/Temp/{2}_snps {1}/Output/Temp/{2}_snps_split" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# First round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {2} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition {2} --out {1}/Output/Temp/{2}_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
# Reading all the outputs into R
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:nrow(gout_top)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top$RSID[i], "_split0", j, ".assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top$RSID[i], "_gwas"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead = SNP, new_p = P)
gout_top2 <- gout_top %>%
cbind(tmp)
gout_top_resid <- gout_top2 %>%
filter(new_p < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Second round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}'"), ' > {1}/Output/Temp/{2}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_2 --out {1}/Output/Temp/{2}_split{3}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_2.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:nrow(gout_top_resid)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid$RSID[i], "_split0", j, "_2.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid$RSID[i], "_gwas2"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead2 = SNP, new_p2 = P)
gout_top3 <- gout_top_resid %>%
cbind(tmp)
gout_top_resid2 <- gout_top3 %>%
filter(new_p2 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Third round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}'"), ' > {1}/Output/Temp/{2}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_3 --out {1}/Output/Temp/{2}_split{3}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_3.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:nrow(gout_top_resid2)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid2$RSID[i], "_split0", j, "_3.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid2$RSID[i], "_gwas3"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead3 = SNP, new_p3 = P)
gout_top4 <- gout_top_resid2 %>%
cbind(tmp)
gout_top_resid3 <- gout_top4 %>%
filter(new_p3 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Fourth round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}'"), ' > {1}/Output/Temp/{2}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_4 --out {1}/Output/Temp/{2}_split{3}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_4.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:nrow(gout_top_resid3)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid3$RSID[i], "_split0", j, "_4.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid3$RSID[i], "_gwas4"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead4 = SNP, new_p4 = P)
gout_top5 <- gout_top_resid3 %>%
cbind(tmp)
gout_top_resid4 <- gout_top5 %>%
filter(new_p4 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Fifth round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}\n{6}'"), ' > {1}/Output/Temp/{2}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead3, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_5 --out {1}/Output/Temp/{2}_split{3}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_5.assoc.logistic"))]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:nrow(gout_top_resid4)){
tmp3 <- c()
for(j in 0:9){
tmp2 <- get(paste0(gout_top_resid4$RSID[i], "_split0", j, "_5.assoc.logistic"))
tmp3 <- rbind(tmp3, tmp2)
}
assign(paste0(gout_top_resid4$RSID[i], "_gwas5"), tmp3 %>% na.omit())
tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>%
rename(new_lead5 = SNP, new_p5 = P)
gout_top6 <- gout_top_resid4 %>%
cbind(tmp)
gout_top_resid5 <- gout_top6 %>%
filter(new_p5 < 5e-8)
rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
# Locus zooms ---------------------------------------
# Loading in code and gene list
source(here("Script/Functions/locus_zoom.R"))
UCSC_GRCh37_Genes_UniqueList.txt <- as.data.frame(read_delim(here("Data/GWAS/UCSC_GRCh37_Genes_UniqueList.txt"), delim = "\t"))
# Plotting locus zooms of original GWAS
# Calculating LD
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))
# Reading the LD reports back into R
for(i in 1:nrow(gout_top)){
assign(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld.ld")))
}
# Making full list of SNPs for labelling
first_round <- gout_top_resid %>%
select(new_lead) %>%
rename(RSID = new_lead)
second_round <- gout_top_resid2 %>%
select(new_lead2) %>%
rename(RSID = new_lead2)
third_round <- gout_top_resid3 %>%
select(new_lead3) %>%
rename(RSID = new_lead3)
fourth_round <- gout_top_resid4 %>%
select(new_lead4) %>%
rename(RSID = new_lead4)
gout_top_full <- gout_top %>%
select(RSID) %>%
rbind(first_round, second_round, third_round, fourth_round) %>%
left_join(biallelic_loci, by = "RSID") %>%
arrange(CHR, BP)
# Plotting the locus zooms
for(i in 1:nrow(gout_top)){
locus.zoom(data = biallelic_loci %>% mutate(SNP = RSID) %>% filter(!is.na(SNP), CHR == gout_top$CHR[i] & between(BP, gout_top$BP1[i], gout_top$BP2[i])),
region = c(gout_top$CHR[i], gout_top$BP1[i], gout_top$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Unconditioned ", gout_top$RSID[i], " Locus Zoom"),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top$CHR[i], "_", gout_top$BP1[i], "_", gout_top$BP2[i], "_", gout_top$RSID[i], "_unconditioned", ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of first round of conditioning
# Remaking LD based on ALL new lead SNPs
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top2$CHR, collapse = " "), ' ::: ', paste(gout_top2$new_lead, collapse = " ")))
# Reading the LD reports back into R
for(i in 1:nrow(gout_top2)){
assign(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld.ld")))
}
# Plotting locus zooms
for(i in 1:nrow(gout_top2)){
locus.zoom(data = get(paste0(gout_top2$RSID[i], "_gwas")),
region = c(gout_top$CHR[i], gout_top2$BP1[i], gout_top2$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top2$RSID[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top2$CHR[i], "_", gout_top2$BP1[i], "_", gout_top2$BP2[i], "_", gout_top2$RSID[i], "_condition_", gout_top2$RSID[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of second round of conditioning
# Remaking LD based on new lead SNPs
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top3$CHR, collapse = " "), ' ::: ', paste(gout_top3$new_lead2, collapse = " ")))
# Reading the LD reports back into R
for(i in 1:nrow(gout_top3)){
assign(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld.ld")))
}
# Plotting locus zooms
for(i in 1:nrow(gout_top3)){
locus.zoom(data = get(paste0(gout_top3$RSID[i], "_gwas2")),
region = c(gout_top3$CHR[i], gout_top3$BP1[i], gout_top3$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top3$RSID[i], " and ", gout_top3$new_lead[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top3$CHR[i], "_", gout_top3$BP1[i], "_", gout_top3$BP2[i], "_", gout_top3$RSID[i], "_condition_", gout_top3$RSID[i], "and", gout_top3$new_lead[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of third round of conditioning
# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top4$CHR, collapse = " "), ' ::: ', paste(gout_top4$new_lead3, collapse = " ")))
# Reading the LD reports back into R
for(i in 1:nrow(gout_top4)){
assign(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld.ld")))
}
# Plotting locus zooms
for(i in 1:nrow(gout_top4)){
locus.zoom(data = get(paste0(gout_top4$RSID[i], "_gwas3")),
region = c(gout_top4$CHR[i], gout_top4$BP1[i], gout_top4$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top4$RSID[i], " and ", gout_top4$new_lead[i], " and ", gout_top4$new_lead2[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top4$CHR[i], "_", gout_top4$BP1[i], "_", gout_top4$BP2[i], "_", gout_top4$RSID[i], "_condition_", gout_top4$RSID[i], "and", gout_top4$new_lead[i], "and", gout_top4$new_lead2[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of fourth round of conditioning
# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top5$CHR, collapse = " "), ' ::: ', paste(gout_top5$new_lead4, collapse = " ")))
# Next I need to read the ld reports back into R
for(i in 1:nrow(gout_top5)){
assign(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld.ld")))
}
for(i in 1:nrow(gout_top5)){
locus.zoom(data = get(paste0(gout_top5$RSID[i], "_gwas4")),
region = c(gout_top5$CHR[i], gout_top5$BP1[i], gout_top5$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top5$RSID[i], " and ", gout_top5$new_lead[i], " and ", gout_top5$new_lead2[i], " and ", gout_top5$new_lead3[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top5$CHR[i], "_", gout_top5$BP1[i], "_", gout_top5$BP2[i], "_", gout_top5$RSID[i], "_condition_", gout_top5$RSID[i], "and", gout_top5$new_lead[i], "and", gout_top5$new_lead2[i], "and", gout_top5$new_lead3[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Plotting locus zooms of fifth round of conditioning
# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top6$CHR, collapse = " "), ' ::: ', paste(gout_top6$new_lead5, collapse = " ")))
# Next I need to read the ld reports back into R
for(i in 1:nrow(gout_top6)){
assign(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld.ld")))
}
for(i in 1:nrow(gout_top6)){
locus.zoom(data = get(paste0(gout_top6$RSID[i], "_gwas5")),
region = c(gout_top6$CHR[i], gout_top6$BP1[i], gout_top6$BP2[i]),
offset_bp = 0,
ld.file = get(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld")),
genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
plot.title = paste0("Conditioned on ", gout_top6$RSID[i], " and ", gout_top6$new_lead[i], " and ", gout_top6$new_lead2[i], " and ", gout_top6$new_lead3[i]),
file.name = paste0(here("Output/Plots/"), "Chr", gout_top6$CHR[i], "_", gout_top6$BP1[i], "_", gout_top6$BP2[i], "_", gout_top6$RSID[i], "_condition_", gout_top6$RSID[i], "and", gout_top6$new_lead[i], "and", gout_top6$new_lead2[i], "and", gout_top6$new_lead3[i], ".jpg"),
secondary.snp = gout_top_full$RSID,
secondary.label = TRUE)
}
# Combining all GWAS results together into final list ----------------------------------------------
regions <- gout_top %>%
select(CHR, BP1, BP2)
out <- c()
for(i in 1:nrow(regions)){
tmp <- gout_top_full %>%
filter(CHR == regions$CHR[i] & between(BP, regions$BP1[i], regions$BP2[i])) %>%
mutate(BP1 = regions$BP1[i],
BP2 = regions$BP2[i])
out <- rbind(out, tmp)
}
gout_top_full <- out
# For each of the loci with multiple SNPs, I need to test the association of each SNP after adjusting for all others at the locus
# So first lets pull out all SNPs from those loci
multi_snps <- gout_top_full %>%
filter(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1]))
tmp <- multi_snps %>% select(RSID)
#write_delim(tmp, file = here("Output/Temp/snps_to_extract.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --extract ', here("Output/Temp/snps_to_extract.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_test" ::: ', paste(unique(multi_snps$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr4_test --bmerge ', here("Output/Temp/"), 'chr11_test --make-bed --out ', here("Output/Temp/"), 'merged_test'))
for(i in 6:9){
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$RSID[i], ' --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}
for(i in 1:5){
write_delim(multi_snps %>% slice(1:5) %>% select(RSID) %>% filter(RSID != multi_snps$RSID[i]), file = paste0(here("Output/Temp/"), "conditionlist_", multi_snps$RSID[i], ".txt"), delim = "\n", col_names = F)
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_', multi_snps$RSID[i], '.txt --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "final_gwas_.+logistic")]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:5){
tmp2 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(i)
tmp <- rbind(tmp, tmp2)
}
tmp2 <- c()
for(i in 6:9){
tmp3 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(-(1:5))
tmp2 <- rbind(tmp2, tmp3)
}
tmp2 <- tmp2 %>% slice(5, 2, 15, 12)
multi_snps2 <- rbind(tmp, tmp2)
tmp2 <- multi_snps2 %>%
select(CHR:BP, OR:U95, P)
multi_snps3 <- left_join(multi_snps, tmp2, by = c("CHR", "BP"))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --r2 inter-chr --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_ld'))
merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_ld.ld"))
# test multicollinearity in SLC2A9 model in next Rmd
single_snps <- gout_top_full %>%
filter(!(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1])))
single_snps2 <- single_snps %>%
select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO, BP1, BP2)
multi_snps4 <- multi_snps3 %>%
select(CHR, RSID, BP:Alternate_Allele, OR.y:U95.y, P.y, OR.x:U95.x, P.x, EAF, INFO, BP1, BP2) %>%
rename(OR = OR.y,
SE = SE.y,
L95 = L95.y,
U95 = U95.y,
P = P.y,
OR_old = OR.x,
SE_old = SE.x,
L95_old = L95.x,
U95_old = U95.x,
P_old = P.x)
gout_top_final <- full_join(multi_snps4, single_snps2) %>%
arrange(CHR, BP)
# Flipping allele order + OR etc so effect allele is always the gout risk allele + labelling based on locus zooms
smallOR <- gout_top_final %>%
filter(OR < 1) %>%
mutate(OR = as.numeric(signif(1/OR, digits = 4)),
tmp_L = as.numeric(signif(1/L95, digits = 4)),
tmp_U = as.numeric(signif(1/U95, digits = 4)),
U95 = tmp_L,
L95 = tmp_U,
OR_old = as.numeric(round(1/OR_old, digits = 3)),
tmp_L_old = as.numeric(round(1/L95_old, digits = 3)),
tmp_U_old = as.numeric(round(1/U95_old, digits = 3)),
U95_old = tmp_L_old,
L95_old = tmp_U_old,
EAF = 1 - EAF) %>%
rename(allele2 = Effect_Allele,
allele1 = Alternate_Allele) %>%
rename(Alternate_Allele = allele2,
Effect_Allele = allele1) %>%
select(CHR:BP, Effect_Allele, Alternate_Allele, OR:BP2)
bigOR <- gout_top_final %>%
filter(OR > 1)
gout_top_final <- full_join(smallOR, bigOR) %>%
arrange(CHR, BP) %>%
mutate(Locus_Name = c("ARID1A", "PDZK1", "TRIM46", "GCKR", "SFMBT1", "SLC2A9", "SLC2A9", "SLC2A9", "SLC2A9", "SLC2A9", "ABCG2", "ABCG2", "ADH1B", "TMEM171", "RREB1", "SLC17A1", "MLXIPL", "SLC16A9", "ANO3", "SLC22A11", "SLC22A11", "OVOL1", "R3HDM2", "MLXIP", "NFAT5", "INSR", "PNPLA3"))
UKBB_Gene_OR <- gout_top_final
save(UKBB_Gene_OR, file = here("Output/UKBB_Gene_OR.RData"))
# Cleaning up
rm(list = ls()[str_detect(ls(), "^chr|^gout_top|^final_|gwas.$")], bigOR, first_round, multi_snps, multi_snps2, multi_snps3, out, regions, second_round, single_snps, smallOR, third_round, tmp, tmp2, tmp3, file_names, file_names2, i)
The European PRS was then further filtered for the Polynesian analysis as follows:
Biallelic SNPs from the imputed UK Biobank were filtered to only include those directly genotyped in the CoreExome (~ 250,000).
This list of SNPs was overlapped with the final SNP list for the European analysis to find those SNPs directly genotyped in the CoreExome (3 / 27).
For any SNP that was not directly genotyped, a full LD report was produced with candidate proxy SNPs based on the ~250,000 SNPs above.
The SNP with the highest R-squared for each SNP in the European PRS was extracted, then this list was filtered to only include those SNPs that met P < 5e-8 in the full UK Biobank gout GWAS (15 / 24).
The 3 directly genotyped SNPs were combined with these 15 proxies to produce a single file of 18 lead SNPs for the Polynesian analysis.
# Load in SNPs for inclusion in Polynesian analysis
load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
load(here("Output/UKBB_Gene_OR.RData"))
# Next I need to test the 27 lead SNPs from the European analysis for LD with this list of 243,985 variants in order to find the best proxies
# To make this more efficient, I should initially filter the 243,985 SNP list based on the locus boundaries for the Euro PRS
biallelic_loci_poly <- tibble()
tmp_loci <- UKBB_Gene_OR %>%
select(CHR, BP1, BP2) %>%
unique()
for(i in 1:nrow(tmp_loci)){
tmp <- biallelic_sumstat_final_poly %>%
filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
biallelic_loci_poly <- rbind(biallelic_loci_poly, tmp)
}
out <- c()
for(i in 1:nrow(UKBB_Gene_OR)){
test <- biallelic_loci_poly %>%
filter(CHR == UKBB_Gene_OR$CHR[i] & BP == UKBB_Gene_OR$BP[i])
out <- rbind(out, test)
}
need_proxies <- UKBB_Gene_OR %>%
filter(!(BP %in% out$BP))
#write_delim(need_proxies %>% select(RSID), file = here("Output/Temp/need_proxies.txt"), delim = "\t", col_names = F)
biallelic_loci_poly2 <- tibble()
tmp_loci <- need_proxies %>%
select(CHR, BP1, BP2) %>%
unique()
for(i in 1:nrow(tmp_loci)){
tmp <- biallelic_sumstat_final_poly %>%
filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
biallelic_loci_poly2 <- rbind(biallelic_loci_poly2, tmp)
}
tmp <- biallelic_loci_poly2 %>%
mutate(BP1 = BP) %>%
select(CHR, BP, BP1, RSID)
tmp2 <- need_proxies %>%
mutate(BP1 = BP) %>%
select(CHR, BP, BP1, RSID)
need_ld <- rbind(tmp, tmp2) %>%
arrange(CHR, BP)
# write_delim(need_ld, file = here("Output/Temp/need_ld.txt"), delim = "\t", col_names = F)
# Now I need to make a single UK Biobank plink file with all SNPs above and test LD between the 25 SNPs of interest and all others
# For each SNP, I will then pull out the best proxy and filter the final list for proxies with at least 0.8 R-squared with the lead SNP - either that or based on whether they associate with gout
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --extract range ', here("Output/Temp/need_ld.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_poly" ::: ', paste(unique(need_ld$CHR), collapse = " ")))
#write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(need_ld$CHR), "_poly")), file = here("Output/Temp/mergefile.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile.txt --make-bed --out ', here("Output/Temp/"), 'merged_poly'))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly --r2 inter-chr --ld-snp-list ', here("Output/Temp/"), 'need_proxies.txt --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_poly_ld'))
merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_poly_ld.ld"))
out1 <- tibble()
for(i in need_proxies$RSID){
tmp <- merged_ld %>% filter(SNP_A == i & SNP_B != i) %>% arrange(desc(R2)) %>% slice(1)
out1 <- rbind(out1, tmp)
}
tmp <- biallelic_loci_poly %>%
filter(RSID %in% out1$SNP_B) # 1 SNP was the best proxy for two of the SLC2A9 SNPs
tmp2 <- tmp %>%
filter(P < 5e-8) # keeping only those that were significant (rather than keeping based on LD)
out2 <- out1 %>%
filter(SNP_B %in% tmp2$RSID)
out <- c()
for(i in 1:nrow(UKBB_Gene_OR)){
test <- biallelic_loci_poly %>%
filter(CHR == UKBB_Gene_OR$CHR[i] & BP == UKBB_Gene_OR$BP[i])
out <- rbind(out, test)
}
full_list <- rbind(out, tmp2) %>%
arrange(CHR, BP) %>%
select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO)
out1_1 <- out1 %>%
select(SNP_A, SNP_B, R2)
full_list2 <- full_list %>%
left_join(out1_1, by = c("RSID" = "SNP_B")) %>%
slice(-5) # removing the duplicated SLC2A9 SNP
tmp <- UKBB_Gene_OR %>%
filter(RSID %in% full_list2$SNP_A) %>%
select(CHR, BP, RSID, BP1:Locus_Name)
tmp2 <- UKBB_Gene_OR %>%
filter(RSID %in% out$RSID) %>%
select(CHR, BP, RSID, BP1:Locus_Name)
locus_names <- rbind(tmp, tmp2) %>%
arrange(CHR, BP) %>%
select(BP1:Locus_Name)
full_list3 <- full_list2 %>%
cbind(locus_names) %>%
rename(Old_Lead = SNP_A)
# For each of the loci with multiple SNPs, I need to test the association of each SNP after adjusting for all others at the locus
# So first lets pull out all SNPs from those loci
multi_snps <- full_list3 %>%
filter(BP1 %in% names(table(full_list3$BP1)[table(full_list3$BP1) > 1]))
tmp <- multi_snps %>% select(RSID)
#write_delim(tmp, file = here("Output/Temp/snps_to_extract_poly.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly --extract ', here("Output/Temp/snps_to_extract_poly.txt"), ' --make-bed --out ', here("Output/Temp/"), 'merged_poly_multi'))
for(i in 4:7){
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly_multi --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$RSID[i], ' --out ', here("Output/Temp/"), 'final_gwas_poly_', multi_snps$RSID[i]))
}
for(i in 1:3){
write_delim(multi_snps %>% slice(1:3) %>% select(RSID) %>% filter(RSID != multi_snps$RSID[i]), file = paste0(here("Output/Temp/"), "conditionlist_poly_", multi_snps$RSID[i], ".txt"), delim = "\n", col_names = F)
system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly_multi --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_poly_', multi_snps$RSID[i], '.txt --out ', here("Output/Temp/"), 'final_gwas_poly_', multi_snps$RSID[i]))
}
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "final_gwas_poly_.+logistic")]
for(i in file_names){
assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}
tmp <- c()
for(i in 1:3){
tmp2 <- get(paste0("final_gwas_poly_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(i)
tmp <- rbind(tmp, tmp2)
}
tmp2 <- c()
for(i in 4:7){
tmp3 <- get(paste0("final_gwas_poly_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(-(1:3))
tmp2 <- rbind(tmp2, tmp3)
}
tmp2 <- tmp2 %>% slice(5, 2, 15, 12)
multi_snps2 <- rbind(tmp, tmp2)
tmp2 <- multi_snps2 %>%
select(CHR:BP, OR:U95, P)
multi_snps3 <- left_join(multi_snps, tmp2, by = c("CHR", "BP"))
single_snps <- full_list3 %>%
filter(!(BP1 %in% names(table(full_list3$BP1)[table(full_list3$BP1) > 1])))
single_snps2 <- single_snps %>%
select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO, Old_Lead, BP1, BP2, Locus_Name)
multi_snps4 <- multi_snps3 %>%
select(CHR, RSID, BP:Alternate_Allele, OR.y:U95.y, P.y, OR.x:U95.x, P.x, EAF, INFO, Old_Lead, BP1, BP2, Locus_Name) %>%
rename(OR = OR.y,
SE = SE.y,
L95 = L95.y,
U95 = U95.y,
P = P.y,
OR_old = OR.x,
SE_old = SE.x,
L95_old = L95.x,
U95_old = U95.x,
P_old = P.x)
full_list4 <- full_join(multi_snps4, single_snps2) %>%
arrange(CHR, BP)
smallOR <- full_list4 %>%
filter(OR < 1) %>%
mutate(OR = as.numeric(signif(1/OR, digits = 4)),
tmp_L = as.numeric(signif(1/L95, digits = 4)),
tmp_U = as.numeric(signif(1/U95, digits = 4)),
U95 = tmp_L,
L95 = tmp_U,
EAF = 1 - EAF) %>%
rename(allele2 = Effect_Allele,
allele1 = Alternate_Allele) %>%
rename(Alternate_Allele = allele2,
Effect_Allele = allele1) %>%
select(CHR:BP, Effect_Allele, Alternate_Allele, OR:Locus_Name)
bigOR <- full_list4 %>%
filter(OR > 1)
full_list4 <- full_join(smallOR, bigOR) %>%
arrange(CHR, BP)
Poly_Gene_OR <- full_list4
save(Poly_Gene_OR, file = here("Output/Poly_Gene_OR.RData"))
# Cleaning up
rm(biallelic_loci_poly, biallelic_loci_poly2, biallelic_sumstat_final_poly, bigOR, full_list, full_list2, full_list3, locus_names, merged_ld, need_ld, need_proxies, out, out1, out1_1, out2, smallOR, test, tmp, tmp_loci, tmp2, i)
All of the locus zooms are plotted below:
file_names <- list.files(here("Output/Plots"), full.names = T)
tmp <- file_names %>% as_tibble() %>% separate(value, sep = "_", into = c(NA, "X2", "BP1", NA, NA, "Cond", "CondSNPs")) %>%
rownames_to_column() %>% mutate(Cond1 = case_when(Cond == "unconditioned.jpg" ~ FALSE, TRUE ~ TRUE), BP1 = as.numeric(BP1), rowname = as.numeric(rowname)) %>% separate(X2, sep = "/", into = c(NA, NA, NA, NA, NA, NA, NA, "CHR")) %>% mutate(CHR = as.numeric(str_replace(CHR, "Chr", ""))) %>% arrange(CHR, BP1, Cond1, CondSNPs) %>% pull(rowname)
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 21 rows [2, 4, 6,
## 8, 10, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 34, 39, 41, 43, 45, ...].
file_names2 <- file_names[tmp]
include_graphics(file_names2)
The purpose of this document is to generate cleaned up phenotype files for each cohort, with the European and Polynesian polygenic risk scores (PRSs) included for each. It contains the code for going from the raw phenotype and genotype data (in combination with the SNP lists generated before) to the finalized data frames for analysis.
# Loading PRS SNPs for European and Polynesian analyses
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Poly_Gene_OR.RData"))
# Making files for extracting SNPs from the CoreExome VCFs using plink
# if(!file.exists(here("Output/Temp/UKBB_SNPs_Plink.txt"))){
# for_plink <- UKBB_Gene_OR %>%
# select(CHR, BP, RSID) %>%
# mutate(BP2 = BP) %>%
# select(CHR, BP, BP2, RSID)
# write_delim(for_plink, file = here("Output/Temp/UKBB_SNPs_Plink.txt"), col_names = F)
# rm(for_plink)
# }
#
# if(!file.exists(here("Output/Temp/Poly_SNPs_Plink.txt"))){
# for_plink <- Poly_Gene_OR %>%
# select(CHR, BP, RSID) %>%
# mutate(BP2 = BP) %>%
# select(CHR, BP, BP2, RSID)
# write_delim(for_plink, file = here("Output/Temp/Poly_SNPs_Plink.txt"), col_names = F)
# rm(for_plink)
# }
# Extracting SNPs from the CoreExome VCFs using plink
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --vcf /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Imputed_Genotypes/QC1-10_Impute_EUR_only/CZ-MB1.2-QC1.10_EUR_imputed_chr{}.vcf.gz --extract range ', here("Output/Temp/UKBB_SNPs_Plink.txt"), ' --make-bed --out ', here("Output/Temp/UKBB_SNPs_"), 'chr{}" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/UKBB_SNPs_"), 'chr{} --set-missing-var-ids @:# --make-bed --out ', here("Output/Temp/UKBB_SNPs_"), 'chr{}_2" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))
#write_delim(as_tibble(paste0(here("Output/Temp/"), "UKBB_SNPs_chr", unique(UKBB_Gene_OR$CHR), "_2")), file = here("Output/Temp/mergefile_prs1.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile_prs1.txt --make-bed --out ', here("Output/Temp/"), 'merged_PRS_UKBB'))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_PRS_UKBB --recode --out ', here("Output/Temp/"), 'merged_PRS_UKBB'))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --extract range ', here("Output/Temp/Poly_SNPs_Plink.txt"), ' --make-bed --out ', here("Output/Temp/Poly_SNPs")))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'Poly_SNPs --recode --out ', here("Output/Temp/"), 'Poly_SNPs'))
# Making bgen file for extracting SNPs from the UK Biobank
# if(!file.exists(here("Output/Temp/PRS_SNPs_BGEN.txt"))){
# tmp <- UKBB_Gene_OR %>%
# select(CHR, BP)
#
# tmp2 <- Poly_Gene_OR %>%
# select(CHR, BP)
#
# tmp3 <- rbind(tmp, tmp2) %>%
# arrange(CHR, BP) %>%
# unique()
#
# bgen_range1 <- tmp3 %>%
# filter(CHR < 10) %>%
# mutate(BGEN = paste0("0", CHR, ":", BP, "-", BP))
#
# bgen_range2 <- tmp3 %>%
# filter(CHR > 9) %>%
# mutate(BGEN = paste0(CHR, ":", BP, "-", BP))
#
# bgen_range <- rbind(bgen_range1, bgen_range2) %>%
# arrange(CHR, BP) %>%
# select(BGEN)
#
# write_delim(bgen_range, file = here("Output/Temp/PRS_SNPs_BGEN.txt"), delim = "\n", col_names = F)
#
# rm(tmp, tmp2, tmp3, bgen_range1, bgen_range2, bgen_range)
# }
# Extracting SNPs from UK Biobank and converting to merged plink file
#system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "PRS_SNPs_BGEN.txt"), ' | bcftools reheader -h /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forPRS.vcf.gz" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))
#system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forPRS.vcf.gz --make-bed --out ', here("Output/Temp/"), 'chr{}_PRS" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))
#write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(UKBB_Gene_OR$CHR), "_PRS")), file = here("Output/Temp/mergefile_prs.txt"), delim = "\n", col_names = F)
#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile_prs.txt --make-bed --out ', here("Output/Temp/"), 'merged_PRS'))
#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_PRS --recode --out ', here("Output/Temp/"), 'merged_PRS'))
# Making phenotype files ----------------------------------------------------------------------------------------
if(file.exists(here("Output/Phenotypes.RData"))){
load(here("Output/Phenotypes.RData"))
} else {
# CoreExome QC 1-10 Phenotype file
CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t")
# European cohorts = All Ardea (split into each study), EuroGout (includes EireGout), Gout in Aotearoa (combine with AGRIA + DM + RD + NP + LPA => ANZ cohort)
All_Euro_ID <- read_delim(here("Output/Temp/merged_PRS_UKBB.fam"), delim = " ", col_names = F)
CoreExPheno_Euro <- CoreExPheno %>%
filter(Geno.BroadAncestry == "European",
Geno.SampleID %in% All_Euro_ID$X2,
General.Use != "No",
!(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))
# 1,146 NU, 455 HU controls (1,601 total) + 4,694 gout = either GP or ACR or self-report
# Polynesian cohorts = AGRIA, All Ardea, DM, EuroGout, Aotearoa (both new + old), LPA, Ngati Porou, RD => all split into East and West
All_CoreEx_ID <- read_delim("/Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam", delim = " ", col_names = F)
CoreExPheno_Poly <- CoreExPheno %>%
filter(Geno.BroadAncestry == "Oceanian",
Geno.SampleID %in% All_CoreEx_ID$X2,
General.Use != "No",
!(Pheno.Study %in% c("ESR", "Pacific Trust")),
!is.na(Pheno.GoutSummary))
# 1,021 NU, 248 HU controls (1,269 total) + 1,380 gout
rm(CoreExPheno, All_CoreEx_ID, All_Euro_ID)
# Extracting relevant phenotype information from each cohort
# Phenotypes of interest are: gout, age at recruitment, sex, age at gout onset, flares in last year, presence of tophi, current allopurinol usage, disease duration, serum urate, BMI, total alcohol consumption, sugar-sweetened drink consumption, hypertension, type 2 diabetes, heart disease, diuretic use, serum creatinine/eGFR
# Can get gout, age, sex, ULT usage, serum urate, BMI, Polynesian Group, and PCs from CoreExPheno file (reliable as done by Tanya)
# No duplicates, sex mismatches, or ethnicity mismatches present
# First extracting all Aus/NZ European and Polynesian controls (from Gout in Aotearoa, NPH, RD, DM, AGRIA, lPA)
CoreExPheno_Euro_Control <- CoreExPheno_Euro %>%
filter(Pheno.GoutSummary %in% c("Control", "HyperU"),
Pheno.Study %in% c("Gout in Aotearoa", "Ngati Porou", "Renal Disease", "Diabetes Mellitus", "AGRIA", "LPA")) %>%
select(Pheno.SampleID, Geno.SampleID, Pheno.Study, Pheno.GoutSummary, Pheno.CollectionUrate, Pheno.Gender, Pheno.Age, Geno.PCVector1:Geno.PCVector10, Pheno.BMI)
CoreExPheno_Poly_Control <- CoreExPheno_Poly %>%
filter(Pheno.GoutSummary %in% c("Control", "HyperU"),
Pheno.Study %in% c("Gout in Aotearoa", "Ngati Porou", "Renal Disease", "Diabetes Mellitus", "AGRIA", "LPA")) %>%
select(Pheno.SampleID, Geno.SampleID, Pheno.Study, Pheno.GoutSummary, Pheno.CollectionUrate, Geno.PolynesianGroup, Pheno.Gender, Pheno.Age, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, Pheno.BMI)
# Now extracting important phenos from CoreExPheno for Euro and Poly gout patients
CoreExPheno_Euro_Gout <- CoreExPheno_Euro %>%
filter(Pheno.GoutSummary == "Gout") %>%
select(Pheno.SampleID, Pheno.Study, Pheno.Gender, Pheno.Age, Pheno.GoutSummary:Pheno.WithdrawReason, Geno.SampleID, Geno.GeneticDuplicate:Geno.PCVector10, Geno.BroadAncestry, Pheno.BMI)
CoreExPheno_Poly_Gout <- CoreExPheno_Poly %>%
filter(Pheno.GoutSummary == "Gout") %>%
select(Pheno.SampleID, Pheno.Study, Pheno.Gender, Pheno.Age, Pheno.GoutSummary:Pheno.WithdrawReason, Geno.SampleID, Geno.GeneticDuplicate:Geno.PCVector10, Geno.BroadAncestry:Geno.PCVector10_Oc, Geno.PolynesianGroup:Notes, Pheno.BMI)
rm(CoreExPheno_Euro, CoreExPheno_Poly)
# # Now the second CoreEx phenotype file (nothing of use)
# CoreExPheno2 <- read_delim("Phenotypes/CoreExPheno150618.txt", delim = "\t") %>%
# filter(!(STUDYCOHORT %in% c(5, 8, 9, 11, 13, 14, 15)),
# !is.na(STUDYCOHORT))
#
# rm(CoreExPheno2)
# So I need to extract/derive flares, tophi, onset, duration, total alcohol consumption, sugar-sweetened drink consumption, hypertension, type 2 diabetes, heart disease, diuretic use, serum creatinine/eGFR from original SNPmax files
# Throughout, duration will be derived from onset - age (from CoreExPheno) + 1 (because I am deriving it from ages in years, there is up to 1 additional year duration that needs to be accounted for)
# Filter cohorts to only include those in the CoreExPheno_Euro or CoreExPheno_Poly files
# Going through each cohort alphabetically: AGRIA, Ardea - Ironwood (Euro + Poly), Ardea - LASSO (Euro + Poly), DM (Euro + Poly), EuroGout (Euro + Poly), Gout in Aotearoa (Euro + Poly), LPA (Euro + Poly), Ngati Porou (Euro + Poly), RD (Euro + Poly)
CoreExPheno_Euro_Final <- full_join(CoreExPheno_Euro_Gout, CoreExPheno_Euro_Control)
CoreExPheno_Poly_Final <- full_join(CoreExPheno_Poly_Gout, CoreExPheno_Poly_Control)
CoreExPheno_Final <- full_join(CoreExPheno_Euro_Final, CoreExPheno_Poly_Final)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
}
# AGRIA
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study == "AGRIA")
agriapheno <- read_delim(here("Data/Phenotypes/AGRIAPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(PATIENT, AGEGOUTDOX, TOPHUS, DIABETES, HIBP, LIPIDS, HEART, CREAT, GOUTCRITERIAB, SUSTOPHUS, BEER, WINE, SPIRITS, OTHALCO, SUGDRINK, TOTALALC, DIURETICSUMMARY, SCREAT) %>%
rename(IID = PATIENT,
AGE1ATK = AGEGOUTDOX) %>%
mutate(NUMATK = rep(NA, nrow(.)),
TOPHUS = as.logical(factor(TOPHUS, levels = c(1, 2, 3), labels = c("FALSE", "TRUE", "NA"))),
GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHIGOUT = case_when(TOPHUS == TRUE | GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) ~ NA, TRUE ~ FALSE),
T2D = logicfactor(DIABETES),
HIBP = logicfactor(HIBP),
LIPID = logicfactor(LIPIDS),
HEART = logicfactor(HEART),
SCREAT = case_when(!is.na(CREAT) & !is.na(SCREAT) ~ (CREAT + SCREAT) / 2 / 88.42, is.na(CREAT) & !is.na(SCREAT) ~ CREAT / 88.42, !is.na(CREAT) & is.na(SCREAT) ~ SCREAT / 88.42, is.na(CREAT) & is.na(SCREAT) ~ NA_real_)) %>%
select(IID, AGE1ATK, TOPHIGOUT, NUMATK, T2D, HIBP, LIPID, HEART, SCREAT, SUGDRINK) %>%
left_join(tmp %>% select(Pheno.SampleID, Pheno.Age), by = c("IID" = "Pheno.SampleID")) %>%
mutate(DURATION = Pheno.Age - AGE1ATK + 1) %>%
rename(AGECOL = Pheno.Age)
# Ardea - Ironwood (CLEAR1, CLEAR2, CRYSTAL, LIGHT)
# European in Ironwood
tmp <- CoreExPheno_Final %>%
filter(Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2", "Ardea: CRYSTAL", "Ardea: LIGHT"))
ironwood_euro_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>%
filter(SUBJID %in% tmp$Pheno.SampleID) %>%
select(SUBJID, ANGINA, DIABETES, HEARTFAILURE, HYPERCHOLESTEROL, HYPERTENSION, HYPERTRIGLY, MI, AGFIDDT, GFDUR, BRTHDTC, CRITBFL, PHNM8FL, )
# %>%
# select(tmp2$X1, -AGE) %>%
# rename(IID = SUBJID,
# NUMATK = GFNUM,
# TOPHIGOUT2 = TOPHIGOUT,
# DURATION2 = GFDUR) %>%
# left_join(ages, by = "IID") %>%
# mutate(CRITBFL = logicfactor(CRITBFL),
# PHNM8FL = logicfactor(PHNM8FL),
# TOPHIFN = logicfactor(TOPHIFN),
# TOPHIGOUT2 = as.logical(factor(TOPHIGOUT2, levels = c("Non-tophaceous gout", "Tophaceous gout"), labels = c("FALSE", "TRUE"))),
# TOPHIGOUT = case_when(CRITBFL == TRUE | PHNM8FL == TRUE | TOPHIFN == TRUE | TOPHIGOUT2 == TRUE | BLTOPHN %in% 1:5 ~ TRUE, is.na(CRITBFL) & is.na(PHNM8FL) & is.na(TOPHIFN) & is.na(BLTOPHN) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
# AGFIDDT = year(as_date(AGFIDDT)),
# AGE1ATK = AGFIDDT - BRTHDTC,
# DURATION = AGECOL - AGE1ATK + 1) %>%
# select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, STUDYACRONYM)
# Polynesians in Ironwood
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2", "Ardea: CRYSTAL", "Ardea: LIGHT"))
ironwood_poly_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>%
filter(SUBJID %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGE) %>%
rename(IID = SUBJID,
NUMATK = GFNUM,
TOPHIGOUT2 = TOPHIGOUT,
DURATION2 = GFDUR) %>%
left_join(ages, by = "IID") %>%
mutate(CRITBFL = logicfactor(CRITBFL),
PHNM8FL = logicfactor(PHNM8FL),
TOPHIFN = logicfactor(TOPHIFN),
TOPHIGOUT2 = as.logical(factor(TOPHIGOUT2, levels = c("Non-tophaceous gout", "Tophaceous gout"), labels = c("FALSE", "TRUE"))),
TOPHIGOUT = case_when(CRITBFL == TRUE | PHNM8FL == TRUE | TOPHIFN == TRUE | TOPHIGOUT2 == TRUE | BLTOPHN %in% 1:5 ~ TRUE, is.na(CRITBFL) & is.na(PHNM8FL) & is.na(TOPHIFN) & is.na(BLTOPHN) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
AGFIDDT = year(as_date(AGFIDDT)),
AGE1ATK = AGFIDDT - BRTHDTC,
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, STUDYACRONYM)
rm(tmp, tmp2, logicfactor)
# Ardea - LASSO
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/Ardea_LASSO_Flare.txt"), delim = "\t", col_names = F)
lassopheno1 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoFlare.txt"), delim = "\t") %>%
select(tmp2$X1)
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/Ardea_LASSO_LabChem.txt"), delim = "\t", col_names = F)
lassopheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoLabChem.txt"), delim = "\t") %>%
select(tmp2$X1)
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/Ardea_LASSO_Main.txt"), delim = "\t", col_names = F)
lassopheno3 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t") %>%
select(tmp2$X1) %>%
filter(SUBJID %in% lassopheno1$SUBJID)
tmp <- full_join(lassopheno3, lassopheno2)
lassopheno <- full_join(tmp, lassopheno1)
rm(lassopheno1, lassopheno2, lassopheno3, tmp, tmp2)
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "Ardea: 401")
logicfactor <- function(x) {
as.logical(factor(x, levels = c(0, 1), labels = c("FALSE", "TRUE")))
}
lasso_euro_pheno <- lassopheno %>%
filter(DNAID %in% tmp$Pheno.SampleID) %>%
mutate(IID = as.character(DNAID)) %>%
rename(NUMATK = GFNUM,
DURATION2 = GFDUR) %>%
left_join(filter(ages, Pheno.Study == "Ardea: 401"), by = "IID") %>%
mutate(TOPHIGOUT = logicfactor(BLTPHFN),
AGFIDDT = year(AGFIDDT),
AGE1ATK = AGFIDDT - year(BRTHDTC),
DURATION = AGECOL - AGE1ATK + 1,
AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK, is.na(AGE1ATK) ~ AGECOL - DURATION2),
DURATION = case_when(!is.na(DURATION) ~ DURATION, is.na(DURATION) ~ AGECOL - AGE1ATK + 1)) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "Ardea: 401")
lasso_poly_pheno <- lassopheno %>%
filter(DNAID %in% tmp$Pheno.SampleID) %>%
mutate(IID = as.character(DNAID)) %>%
rename(NUMATK = GFNUM,
DURATION2 = GFDUR) %>%
left_join(filter(ages, Pheno.Study == "Ardea: 401"), by = "IID") %>%
mutate(TOPHIGOUT = logicfactor(BLTPHFN),
AGFIDDT = year(AGFIDDT),
AGE1ATK = AGFIDDT - year(BRTHDTC),
DURATION = AGECOL - AGE1ATK + 1,
AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK, is.na(AGE1ATK) ~ AGECOL - DURATION2),
DURATION = case_when(!is.na(DURATION) ~ DURATION, is.na(DURATION) ~ AGECOL - AGE1ATK + 1)) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
rm(lassopheno, tmp, logicfactor)
# Ardea - Other (232 other Ardea study participants (594 and 3170)) - asked ruth if they have separate pheno files - she said we aren't allowed to use them - Tony suggested I just say Tony said why not use them
tmp <- CoreExPheno_Euro_Gout %>%
filter(str_detect(Pheno.Study, "Ardea: 3170") | str_detect(Pheno.Study, "Ardea: 594"))
other_ardea_euro_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>%
filter(SUBJID %in% tmp$Pheno.SampleID)
other_ardea_euro_pheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t") %>%
filter(DNAID %in% tmp$Pheno.SampleID | SUBJID %in% tmp$Pheno.SampleID)
rm(tmp, other_ardea_euro_pheno, other_ardea_euro_pheno2)
# DM
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "Diabetes Mellitus")
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/DM.txt"), delim = "\t", col_names = F)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
}
dm_euro_pheno <- read_delim(here("Data/Phenotypes/DMPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT = case_when(TOPHUS == TRUE | GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "Diabetes Mellitus")
dm_poly_pheno <- read_delim(here("Data/Phenotypes/DMPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT = case_when(TOPHUS == TRUE | GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
rm(tmp, tmp2, logicfactor)
# EuroGout
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "EuroGout")
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/EuroGout.txt"), delim = "\t", col_names = F)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(0, 1), labels = c("FALSE", "TRUE")))
}
eurogout_euro_pheno <- read_delim(here("Data/Phenotypes/EuroGoutPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1) %>%
rename(IID = SUBJECT,
AGE1ATK = AGEFIRSTATTK,
NUMATK = NUMATTACKS,
DURATION2 = DURATIONGOUT) %>%
left_join(filter(ages, Pheno.Study == "EuroGout"), by = "IID") %>%
mutate(TOPHUS = logicfactor(TOPHUS),
NUMTOPHI = as.factor(NUMTOPHI),
ACRB = logicfactor(ACRB),
ACRC8 = logicfactor(ACRC8),
TOPHIGOUT = case_when(TOPHUS == TRUE | NUMTOPHI %in% c(1, 2, 3) | ACRB == TRUE | ACRC8 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(NUMTOPHI) & is.na(ACRB) & is.na(ACRC8) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1,
DURATION = case_when(!is.na(DURATION) ~ DURATION, is.na(DURATION) ~ DURATION2),
AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK, is.na(AGE1ATK) ~ AGECOL - DURATION - 1)) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "EuroGout")
eurogout_poly_pheno <- read_delim(here("Data/Phenotypes/EuroGoutPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1) %>%
rename(IID = SUBJECT,
AGE1ATK = AGEFIRSTATTK,
NUMATK = NUMATTACKS,
DURATION2 = DURATIONGOUT) %>%
left_join(filter(ages, Pheno.Study == "EuroGout"), by = "IID") %>%
mutate(TOPHUS = logicfactor(TOPHUS),
NUMTOPHI = as.factor(NUMTOPHI),
ACRB = logicfactor(ACRB),
ACRC8 = logicfactor(ACRC8),
TOPHIGOUT = case_when(TOPHUS == TRUE | NUMTOPHI %in% c(1, 2, 3) | ACRB == TRUE | ACRC8 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(NUMTOPHI) & is.na(ACRB) & is.na(ACRC8) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1,
DURATION = case_when(!is.na(DURATION) ~ DURATION, is.na(DURATION) ~ DURATION2),
AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK, is.na(AGE1ATK) ~ AGECOL - DURATION - 1)) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION) # All 4 accounted for
rm(tmp, tmp2, logicfactor)
# Gout in Aotearoa
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "Gout in Aotearoa")
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/GoutinAotearoa.txt"), delim = "\t", col_names = F)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
}
aotearoa_euro_pheno <- read_delim(here("Data/Phenotypes/NZPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = SUBJECT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT2 = as.logical(factor(TOPHIGOUT, levels = c(0, 1, 2), labels = c("NA", "FALSE", "TRUE"))),
TOPHIGOUT = case_when(GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE | TOPHUS == TRUE | TOPHIGOUT2 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "Gout in Aotearoa")
aotearoa_poly_pheno <- read_delim(here("Data/Phenotypes/NZPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = SUBJECT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT2 = as.logical(factor(TOPHIGOUT, levels = c(0, 1, 2), labels = c("NA", "FALSE", "TRUE"))),
TOPHIGOUT = case_when(GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE | TOPHUS == TRUE | TOPHIGOUT2 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
rm(tmp, tmp2, logicfactor)
# LPA (don't have any phenotypes of interest for this study)
lpa_euro_pheno <- read_delim(here("Data/Phenotypes/LPAPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% CoreExPheno_Euro_Gout$Pheno.SampleID) # All 349 accounted for
lpa_poly_pheno <- read_delim(here("Data/Phenotypes/LPAPheno.txt"), delim = "\t") %>%
filter(SUBJECT %in% CoreExPheno_Poly_Gout$Pheno.SampleID) # All 15 accounted for
rm(lpa_euro_pheno, lpa_poly_pheno)
# Ngati Porou
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "Ngati Porou")
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/NPH.txt"), delim = "\t", col_names = F)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
}
nph_euro_pheno <- read_delim(here("Data/Phenotypes/NPHPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT2 = as.logical(factor(TOPHIGOUT, levels = c(1, 2, 3), labels = c("NA", "FALSE", "TRUE"))),
TOPHIGOUT = case_when(TOPHUS == TRUE | SUSTOPHUS == TRUE | GOUTCRITERIAB == TRUE | TOPHIGOUT2 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "Ngati Porou")
nph_poly_pheno <- read_delim(here("Data/Phenotypes/NPHPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT2 = as.logical(factor(TOPHIGOUT, levels = c(1, 2, 3), labels = c("NA", "FALSE", "TRUE"))),
TOPHIGOUT = case_when(TOPHUS == TRUE | SUSTOPHUS == TRUE | GOUTCRITERIAB == TRUE | TOPHIGOUT2 == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) & is.na(TOPHIGOUT2) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
rm(tmp2, logicfactor)
# RD
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study == "Renal Disease")
tmp2 <- read_delim(here("Data/Phenotypes/For_selecting_vars/RD.txt"), delim = "\t", col_names = F)
logicfactor <- function(x) {
as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
}
rd_euro_pheno <- read_delim(here("Data/Phenotypes/RDPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(NUMATK = rep(NA, nrow(.)),
GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT = case_when(GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE | TOPHUS == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
tmp <- CoreExPheno_Poly_Gout %>%
filter(Pheno.Study == "Renal Disease")
rd_poly_pheno <- read_delim(here("Data/Phenotypes/RDPheno.txt"), delim = "\t") %>%
filter(PATIENT %in% tmp$Pheno.SampleID) %>%
select(tmp2$X1, -AGECOL) %>%
rename(IID = PATIENT) %>%
left_join(ages, by = "IID") %>%
mutate(NUMATK = rep(NA, nrow(.)),
GOUTCRITERIAB = logicfactor(GOUTCRITERIAB),
SUSTOPHUS = logicfactor(SUSTOPHUS),
TOPHUS = logicfactor(TOPHUS),
TOPHIGOUT = case_when(GOUTCRITERIAB == TRUE | SUSTOPHUS == TRUE | TOPHUS == TRUE ~ TRUE, is.na(TOPHUS) & is.na(GOUTCRITERIAB) & is.na(SUSTOPHUS) ~ NA, TRUE ~ FALSE),
DURATION = AGECOL - AGE1ATK + 1) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION)
rm(tmp, tmp2, logicfactor, ages)
# Combining AGRIA, Aotearoa, DM, NPH and RD into ANZ
tmp <- agriapheno %>%
mutate(COHORT = rep("AGRIA", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp2 <- aotearoa_euro_pheno %>%
mutate(COHORT = rep("Gout in Aotearoa", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp3 <- dm_euro_pheno %>%
mutate(COHORT = rep("Diabetes Mellitus", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp4 <- nph_euro_pheno %>%
mutate(COHORT = rep("Ngati Porou Hauora", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp5 <- rd_euro_pheno %>%
mutate(COHORT = rep("Renal Disease", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
anz_euro_pheno <- rbind(tmp, tmp2, tmp3, tmp4, tmp5) %>%
mutate(COHORT = as.factor(COHORT)) %>%
arrange(IID)
rm(aotearoa_euro_pheno, agriapheno, dm_euro_pheno, nph_euro_pheno, rd_euro_pheno, tmp, tmp2, tmp3, tmp4, tmp5)
# Combining all Polynesian cohorts together
tmp <- aotearoa_poly_pheno %>%
mutate(COHORT = rep("Gout in Aotearoa", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp2 <- dm_poly_pheno %>%
mutate(COHORT = rep("Diabetes Mellitus", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp3 <- eurogout_poly_pheno %>%
mutate(COHORT = rep("EuroGout", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp4 <- ironwood_poly_pheno %>%
mutate(COHORT = paste0("Ardea - ", STUDYACRONYM)) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp5 <- lasso_poly_pheno %>%
mutate(COHORT = rep("Ardea - LASSO", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp6 <- nph_poly_pheno %>%
mutate(COHORT = rep("Ngati Porou Hauora", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
tmp7 <- rd_poly_pheno %>%
mutate(COHORT = rep("Renal Disease", nrow(.))) %>%
select(IID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT)
poly_pheno <- rbind(tmp, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7) %>%
mutate(COHORT = as.factor(COHORT)) %>%
arrange(IID) %>%
unique()
rm(aotearoa_poly_pheno, dm_poly_pheno, eurogout_poly_pheno, ironwood_poly_pheno, lasso_poly_pheno, nph_poly_pheno, rd_poly_pheno, tmp, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7)
# Now I want to merge the rest of the phenotypes from the CoreExPheno files into the final-ish phenotype files
anz_euro_pheno <- anz_euro_pheno %>%
left_join(CoreExPheno_Euro_Gout, by = c("IID" = "Pheno.SampleID")) %>%
rename(AGECOL = Pheno.Age,
PC1 = Geno.PCVector1,
PC2 = Geno.PCVector2,
PC3 = Geno.PCVector3,
PC4 = Geno.PCVector4,
PC5 = Geno.PCVector5,
PC6 = Geno.PCVector6,
PC7 = Geno.PCVector7,
PC8 = Geno.PCVector8,
PC9 = Geno.PCVector9,
PC10 = Geno.PCVector10) %>%
mutate(SEX = as.factor(Pheno.Gender),
GOUT = rep(TRUE, nrow(.)),
SURICACID = Pheno.CollectionUrate / 0.05949,
ULT = as.logical(factor(Pheno.UrateTherapy, levels = c("No", "Yes"), labels = c("FALSE", "TRUE")))) %>%
select(IID, Geno.SampleID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT, PC1:PC10) %>%
filter(!(is.na(NUMATK) & is.na(TOPHIGOUT) & is.na(AGE1ATK)))
eurogout_euro_pheno <- eurogout_euro_pheno %>%
left_join(CoreExPheno_Euro_Gout, by = c("IID" = "Pheno.SampleID")) %>%
filter(Geno.GeneticSex == Pheno.Gender,
!(is.na(NUMATK) & is.na(TOPHIGOUT) & is.na(AGE1ATK))) %>%
rename(AGECOL = Pheno.Age,
PC1 = Geno.PCVector1,
PC2 = Geno.PCVector2,
PC3 = Geno.PCVector3,
PC4 = Geno.PCVector4,
PC5 = Geno.PCVector5,
PC6 = Geno.PCVector6,
PC7 = Geno.PCVector7,
PC8 = Geno.PCVector8,
PC9 = Geno.PCVector9,
PC10 = Geno.PCVector10) %>%
mutate(SEX = as.factor(Pheno.Gender),
GOUT = rep(TRUE, nrow(.)),
SURICACID = Pheno.CollectionUrate / 0.05949,
ULT = as.logical(factor(Pheno.UrateTherapy, levels = c("No", "Yes"), labels = c("FALSE", "TRUE"))),
COHORT = rep("EuroGout", nrow(.))) %>%
select(IID, Geno.SampleID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT, PC1:PC10)
ironwood_euro_pheno <- ironwood_euro_pheno %>%
mutate(COHORT = paste0("Ardea: ", STUDYACRONYM)) %>%
left_join(CoreExPheno_Euro_Gout, by = c("IID" = "Pheno.SampleID", "COHORT" = "Pheno.Study")) %>%
unique() %>%
filter(Geno.GeneticSex == Pheno.Gender,
!(is.na(NUMATK) & is.na(TOPHIGOUT) & is.na(AGE1ATK))) %>%
rename(AGECOL = Pheno.Age,
PC1 = Geno.PCVector1,
PC2 = Geno.PCVector2,
PC3 = Geno.PCVector3,
PC4 = Geno.PCVector4,
PC5 = Geno.PCVector5,
PC6 = Geno.PCVector6,
PC7 = Geno.PCVector7,
PC8 = Geno.PCVector8,
PC9 = Geno.PCVector9,
PC10 = Geno.PCVector10) %>%
mutate(SEX = as.factor(Pheno.Gender),
GOUT = rep(TRUE, nrow(.)),
SURICACID = Pheno.CollectionUrate / 0.05949,
ULT = as.logical(factor(Pheno.UrateTherapy, levels = c("No", "Yes"), labels = c("FALSE", "TRUE"))),
ULT = case_when(COHORT %in% c("Ardea: CLEAR1", "Ardea: CLEAR2") | (COHORT == "Ardea: CRYSTAL" & SURICACID < 8) ~ TRUE, COHORT == "Ardea: LIGHT" ~ NA, TRUE ~ ULT)) %>%
select(IID, Geno.SampleID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT, PC1:PC10)
lasso_euro_pheno <- lasso_euro_pheno %>%
mutate(COHORT = rep("Ardea: 401", nrow(.))) %>%
left_join(CoreExPheno_Euro_Gout, by = c("IID" = "Pheno.SampleID", "COHORT" = "Pheno.Study")) %>%
filter(Geno.GeneticSex == Pheno.Gender,
!(is.na(NUMATK) & is.na(TOPHIGOUT) & is.na(AGE1ATK))) %>%
rename(AGECOL = Pheno.Age,
PC1 = Geno.PCVector1,
PC2 = Geno.PCVector2,
PC3 = Geno.PCVector3,
PC4 = Geno.PCVector4,
PC5 = Geno.PCVector5,
PC6 = Geno.PCVector6,
PC7 = Geno.PCVector7,
PC8 = Geno.PCVector8,
PC9 = Geno.PCVector9,
PC10 = Geno.PCVector10) %>%
mutate(SEX = as.factor(Pheno.Gender),
GOUT = rep(TRUE, nrow(.)),
SURICACID = Pheno.CollectionUrate / 0.05949,
ULT = as.logical(factor(Pheno.UrateTherapy, levels = c("No", "Yes"), labels = c("FALSE", "TRUE"))),
ULT = case_when(SURICACID < 8 ~ TRUE, TRUE ~ ULT)) %>%
select(IID, Geno.SampleID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT, PC1:PC10)
# Now figuring out which Ardea cohorts are similar based on recruitment
# CLEAR1 = Gout + inadequate response to Allopurinol (persistent HU after treatment) + at least 2 flares in prior year - all US
# CLEAR2 = Exactly the same as CLEAR1 - half US, rest international
# CRYSTAL = All gout recruited + > 8mg/dL off ULT or > 6 mg/dL on ULT + all had tophi - Most US, rest international
# LIGHT = Gout + History of intolerance/contraindication to allop or febux + > 6.5 mg/dL urate - Most US, rest international
# LASSO = Gout + at least 2 flares in prior year + > 8mg/dL off ULT or > 6.5 mg/dL on ULT - Most US, rest international
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study %in% c("Ardea: CLEAR1")) # All 242 United States
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study %in% c("Ardea: CLEAR2")) # 3 Aus, 4 Belgium, 17 Germany, 6 NZ, 37 South Africa, 1 Swiss, 60 Ukraine, 113 US
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study %in% c("Ardea: CRYSTAL")) # 14 Aus, 12 Canada, 4 NZ, 21 Poland, 2 Swiss, 122 US
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study %in% c("Ardea: LIGHT")) # 2 Aus, 9 Belgium, 7 Canada, 5 German, 2 NZ, 13 South Africa, 75 US
tmp <- CoreExPheno_Euro_Gout %>%
filter(Pheno.Study %in% c("Ardea: 401")) # 63 Aus, 10 Belgium, 34 Canada, 5 German, 27 NZ, 119 South Africa, 593 US
# So in summary, CLEAR1 and CLEAR2 are nearly identical except for some internationals in CLEAR2, LASSO same except also included HU gout patients off ULT, CRYSTAL same as LASSO except all had tophi instead of >2 flares, LIGHT is kind of different from rest
# I could extract all 243 LASSO patients on ULT at screening and include them along with CLEAR1 and CLEAR2 for an identically selected cohort of 750 patients (all inadequate response to ULT + > 1 flare in last year)
# The remaining 600 LASSO could stay as their own cohort (none on ULT, all HU and flares > 1 in last year)
# Then CRYSTAL (N = 175) all have tophi, and either are HU or don't respond to ULT
# LIGHT (N = 113) are all HU and can't take ALLO or FEBUX due to contraindication (might be able to combine with LASSO - no ULT)
tmp <- ironwood_euro_pheno %>%
filter(COHORT %in% c("Ardea: CLEAR1", "Ardea: CLEAR2"))
clear_lasso_euro_pheno <- lasso_euro_pheno %>%
filter(ULT == TRUE) %>%
rbind(tmp) %>%
arrange(IID)
lasso_other_euro_pheno <- lasso_euro_pheno %>%
filter(is.na(ULT))
rm(lasso_euro_pheno)
crystal_euro_pheno <- ironwood_euro_pheno %>%
filter(COHORT == "Ardea: CRYSTAL")
light_euro_pheno <- ironwood_euro_pheno %>%
filter(COHORT == "Ardea: LIGHT")
rm(ironwood_euro_pheno, tmp, tmp2)
# Now fixing up the polynesian gout phenotype files
poly_pheno <- poly_pheno %>%
left_join(CoreExPheno_Poly_Gout, by = c("IID" = "Pheno.SampleID", "COHORT" = "Pheno.Study")) %>%
filter(Geno.GeneticSex == Pheno.Gender,
Geno.AncestryMatch != "Self-reported ethnicity does not match genetic ancestry",
!(is.na(NUMATK) & is.na(TOPHIGOUT) & is.na(AGE1ATK))) %>%
rename(AGECOL = Pheno.Age,
PC1 = Geno.PCVector1,
PC2 = Geno.PCVector2,
PC3 = Geno.PCVector3,
PC4 = Geno.PCVector4,
PC5 = Geno.PCVector5,
PC6 = Geno.PCVector6,
PC7 = Geno.PCVector7,
PC8 = Geno.PCVector8,
PC9 = Geno.PCVector9,
PC10 = Geno.PCVector10,
PC1_Oc = Geno.PCVector1_Oc,
PC2_Oc = Geno.PCVector2_Oc,
PC3_Oc = Geno.PCVector3_Oc,
PC4_Oc = Geno.PCVector4_Oc,
PC5_Oc = Geno.PCVector5_Oc,
PC6_Oc = Geno.PCVector6_Oc,
PC7_Oc = Geno.PCVector7_Oc,
PC8_Oc = Geno.PCVector8_Oc,
PC9_Oc = Geno.PCVector9_Oc,
PC10_Oc = Geno.PCVector10_Oc) %>%
mutate(SEX = as.factor(Pheno.Gender),
GOUT = rep(TRUE, nrow(.)),
SURICACID = Pheno.CollectionUrate / 0.05949,
ULT = as.logical(factor(Pheno.UrateTherapy, levels = c("No", "Yes"), labels = c("FALSE", "TRUE")))) %>%
select(IID, Geno.SampleID, NUMATK, TOPHIGOUT, AGE1ATK, DURATION, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT, PC1:PC10, PC1_Oc:PC10_Oc, Geno.PolynesianGroup)
eastpoly_pheno <- poly_pheno %>%
filter(Geno.PolynesianGroup == "East Polynesian")
westpoly_pheno <- poly_pheno %>%
filter(Geno.PolynesianGroup == "West Polynesian")
rm(poly_pheno)
# Finally cleaning up the control files
control_anz_euro_pheno <- CoreExPheno_Euro_Control
control_eastpoly_pheno <- CoreExPheno_EastPoly_Control
control_westpoly_pheno <- CoreExPheno_WestPoly_Control
rm(CoreExPheno_EastPoly_Control, CoreExPheno_Euro_Control, CoreExPheno_Euro_Gout, CoreExPheno_Poly_Gout, CoreExPheno_WestPoly_Control)
save(anz_euro_pheno, clear_lasso_euro_pheno, control_anz_euro_pheno, control_eastpoly_pheno, control_westpoly_pheno, crystal_euro_pheno, eastpoly_pheno, eurogout_euro_pheno, lasso_other_euro_pheno, light_euro_pheno, westpoly_pheno, file = here("Output/Phenotypes.RData"))
}
The PRS was then calculated for all European samples. Using the results of the UKBB GWAS, 55 PRS metrics were generated, this includes the complete PRS, the 27 PRS’s with a single variant excluded and the 27 variants individually.
make_prs_euro <- function(ids) {
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/merged_PRS_UKBB.map"), delim = "\t", col_names = FALSE)
x <- read_delim(here("Output/Temp/merged_PRS_UKBB.ped"), delim = " ", col_names = FALSE, col_types = cols(.default = col_character()))
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = (ncol(x)), by = 2)] <- str_c(map$X2, "_2")
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
x <- x %>% filter(IID %in% ids)
# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)
for (i in 1:nrow(UKBB_Gene_OR)) {
x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
str_replace("0", "NA") %>%
str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
str_replace("0", "NA") %>%
str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x <- x %>%
mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
colnames(x) <- c(colnames(x[1:((num_cols - 1) + i)]), UKBB_Gene_OR[[i, "RSID"]])
}
x <- x %>%
select(2, (num_cols + 1):ncol(x))
# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS
for(i in 1:nrow(UKBB_Gene_OR)) {
x[i + 1] <- x[[i + 1]] * UKBB_Gene_OR[[i, "OR"]]
}
x$PRS <- rowSums(x[2:(ncol(x))])
# Calculating leave-one-out PRS's
num_cols <- ncol(x)
for(i in 1:(ncol(x) - 2)) {
x[i + 1] <- x[[num_cols]] - x[[i + 1]]
}
x <- x %>%
select(IID, PRS, 2:(ncol(x) - 1))
colnames(x)[3:ncol(x)] <- str_c(paste0(colnames(x[2]), "_Less_"), colnames(x[3:ncol(x)]))
# Joining them back together
x <- x %>% left_join(x1)
assign("prs", x, envir = .GlobalEnv)
}
# Australian/NZ cases
ids <- anz_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
anz_euro_pheno_prs <- anz_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(anz_euro_pheno_prs, file = here("Output/anz_euro_pheno_prs.RData"))
rm(ids, prs)
# Australian/NZ controls
ids <- control_anz_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
control_anz_euro_pheno_prs <- control_anz_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(control_anz_euro_pheno_prs, file = here("Output/control_anz_euro_pheno_prs.RData"))
rm(ids, prs)
# Ardea - CLEAR1, CLEAR2 and LASSO (ULT)
ids <- clear_lasso_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
clear_lasso_euro_pheno_prs <- clear_lasso_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(clear_lasso_euro_pheno_prs, file = here("Output/clear_lasso_euro_pheno_prs.RData"))
rm(ids, prs)
# Ardea - CRYSTAL
ids <- crystal_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
crystal_euro_pheno_prs <- crystal_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(crystal_euro_pheno_prs, file = here("Output/crystal_euro_pheno_prs.RData"))
rm(ids, prs)
# EuroGout
ids <- eurogout_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
eurogout_euro_pheno_prs <- eurogout_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(eurogout_euro_pheno_prs, file = here("Output/eurogout_euro_pheno_prs.RData"))
rm(ids, prs)
# Ardea - LASSO (all others)
ids <- lasso_other_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
lasso_other_euro_pheno_prs <- lasso_other_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(lasso_other_euro_pheno_prs, file = here("Output/lasso_other_euro_pheno_prs.RData"))
rm(ids, prs)
# Ardea - LIGHT
ids <- light_euro_pheno$Geno.SampleID
make_prs_euro(ids = ids)
light_euro_pheno_prs <- light_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID"))
save(light_euro_pheno_prs, file = here("Output/light_euro_pheno_prs.RData"))
rm(ids, prs, make_prs_euro)
# UK Biobank
load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/self_report_med.RData")
tmp <- self_report_med %>%
mutate(IID = eid,
ULT = case_when(allopurinol == TRUE | sulphinpyrazone == TRUE | probenecid == TRUE ~ TRUE, TRUE ~ FALSE)) %>%
select(IID, ULT)
load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/final_data_prs.RData")
ukbb_euro_pheno <- final_data_prs %>%
select(eid, gout, age:urate) %>%
rename(IID = eid,
GOUT = gout,
AGECOL = age,
SEX = sex,
SURICACID = urate) %>%
mutate(COHORT = "UK Biobank") %>%
left_join(tmp) %>%
select(IID, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT)
ids <- ukbb_euro_pheno$IID
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/merged_PRS.map"), delim = "\t", col_names = FALSE)
map <- map %>%
separate(X2, into = c("X2", "remove"), sep = ",") %>%
select(-remove)
x <- read_delim(here("Output/Temp/merged_PRS.ped"), delim = " ", col_names = FALSE, col_types = cols(.default = col_character()))
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = (ncol(x)), by = 2)] <- str_c(map$X2, "_2")
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
tmp <- c("FID", "IID", "PID", "MID", "SEX", "AFF", paste0(UKBB_Gene_OR$RSID, "_1"), paste0(UKBB_Gene_OR$RSID, "_2"))
x <- x[, colnames(x) %in% tmp]
x <- x %>% mutate(IID = as.numeric(IID)) %>% filter(IID %in% ids)
# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)
for (i in 1:nrow(UKBB_Gene_OR)) {
x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
str_replace("0", "NA") %>%
str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
str_replace("0", "NA") %>%
str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x <- x %>%
mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
colnames(x) <- c(colnames(x[1:((num_cols - 1) + i)]), UKBB_Gene_OR[[i, "RSID"]])
}
x <- x %>%
select(2, (num_cols + 1):ncol(x))
# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS
for(i in 1:nrow(UKBB_Gene_OR)) {
x[i + 1] <- x[[i + 1]] * UKBB_Gene_OR[[i, "OR"]]
}
x$PRS <- rowSums(x[2:(ncol(x))])
# Calculating leave-one-out PRS's
num_cols <- ncol(x)
for(i in 1:(ncol(x) - 2)) {
x[i + 1] <- x[[num_cols]] - x[[i + 1]]
}
x <- x %>%
select(IID, PRS, 2:(ncol(x) - 1))
colnames(x)[3:ncol(x)] <- str_c(paste0(colnames(x[2]), "_Less_"), colnames(x[3:ncol(x)]))
# Joining them back together
x <- x %>% left_join(x1)
ukbb_euro_pheno_prs <- ukbb_euro_pheno %>%
left_join(x, by = c("IID"))
save(ukbb_euro_pheno_prs, file = here("Output/ukbb_euro_pheno_prs.RData"))
rm(ids, prs, ukbb_euro_pheno, final_data_prs, map, x, x1, i, ids, num_cols, tmp, self_report_med)
The PRS was then calculated for all Polynesian samples. Using the results of the UKBB GWAS (only using genotyped variants), 37 PRS metrics were generated, this includes the complete PRS, the 18 PRS’s with a single variant excluded and the 18 variants individually.
make_prs_poly <- function(ids) {
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/Poly_SNPs.map"), delim = "\t", col_names = FALSE)
x <- read_delim(here("Output/Temp/Poly_SNPs.ped"), delim = " ", col_names = FALSE, col_types = cols(.default = col_character()))
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = (ncol(x)), by = 2)] <- str_c(map$X2, "_2")
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
x <- x %>% filter(IID %in% ids)
# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)
for (i in 1:nrow(Poly_Gene_OR)) {
x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
str_replace("0", "NA") %>%
str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
str_replace("0", "NA") %>%
str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x <- x %>%
mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
colnames(x) <- c(colnames(x[1:((num_cols - 1) + i)]), Poly_Gene_OR[[i, "RSID"]])
}
x <- x %>%
select(2, (num_cols + 1):ncol(x))
# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS
for(i in 1:nrow(Poly_Gene_OR)) {
x[i + 1] <- x[[i + 1]] * Poly_Gene_OR[[i, "OR"]]
}
x$PRS <- rowSums(x[2:(ncol(x))])
# Calculating leave-one-out PRS's
num_cols <- ncol(x)
for(i in 1:(ncol(x) - 2)) {
x[i + 1] <- x[[num_cols]] - x[[i + 1]]
}
x <- x %>%
select(IID, PRS, 2:(ncol(x) - 1))
colnames(x)[3:ncol(x)] <- str_c(paste0(colnames(x[2]), "_Less_"), colnames(x[3:ncol(x)]))
# Joining them back together
x <- x %>% left_join(x1)
assign("prs", x, envir = .GlobalEnv)
}
# East Polynesian cases
ids <- eastpoly_pheno$Geno.SampleID
make_prs_poly(ids = ids)
eastpoly_pheno_prs <- eastpoly_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(eastpoly_pheno_prs, file = here("Output/eastpoly_pheno_prs.RData"))
rm(ids, prs, eastpoly_pheno)
# East Polynesian controls
ids <- control_eastpoly_pheno$Geno.SampleID
make_prs_poly(ids = ids)
control_eastpoly_pheno_prs <- control_eastpoly_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(control_eastpoly_pheno_prs, file = here("Output/control_eastpoly_pheno_prs.RData"))
rm(ids, prs, control_eastpoly_pheno)
# West Polynesian cases
ids <- westpoly_pheno$Geno.SampleID
make_prs_poly(ids = ids)
westpoly_pheno_prs <- westpoly_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(westpoly_pheno_prs, file = here("Output/westpoly_pheno_prs.RData"))
rm(ids, prs, westpoly_pheno)
# West Polynesian controls
ids <- control_westpoly_pheno$Geno.SampleID
make_prs_poly(ids = ids)
control_westpoly_pheno_prs <- control_westpoly_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(control_westpoly_pheno_prs, file = here("Output/control_westpoly_pheno_prs.RData"))
rm(ids, prs, control_westpoly_pheno)
# Australian/NZ cases
ids <- anz_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
anz_euro_pheno_prs_poly <- anz_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(anz_euro_pheno_prs_poly, file = here("Output/anz_euro_pheno_prs_poly.RData"))
rm(ids, prs, anz_euro_pheno)
# Australian/NZ controls
ids <- control_anz_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
control_anz_euro_pheno_prs_poly <- control_anz_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(control_anz_euro_pheno_prs_poly, file = here("Output/control_anz_euro_pheno_prs_poly.RData"))
rm(ids, prs, control_anz_euro_pheno)
# Ardea - CLEAR1, CLEAR2 and LASSO (ULT)
ids <- clear_lasso_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
clear_lasso_euro_pheno_prs_poly <- clear_lasso_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(clear_lasso_euro_pheno_prs_poly, file = here("Output/clear_lasso_euro_pheno_prs_poly.RData"))
rm(ids, prs, clear_lasso_euro_pheno)
# Ardea - CRYSTAL
ids <- crystal_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
crystal_euro_pheno_prs_poly <- crystal_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(crystal_euro_pheno_prs_poly, file = here("Output/crystal_euro_pheno_prs_poly.RData"))
rm(ids, prs, crystal_euro_pheno)
# EuroGout
ids <- eurogout_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
eurogout_euro_pheno_prs_poly <- eurogout_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(eurogout_euro_pheno_prs_poly, file = here("Output/eurogout_euro_pheno_prs_poly.RData"))
rm(ids, prs, eurogout_euro_pheno)
# Ardea - LASSO (all others)
ids <- lasso_other_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
lasso_other_euro_pheno_prs_poly <- lasso_other_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(lasso_other_euro_pheno_prs_poly, file = here("Output/lasso_other_euro_pheno_prs_poly.RData"))
rm(ids, prs, lasso_other_euro_pheno)
# Ardea - LIGHT
ids <- light_euro_pheno$Geno.SampleID
make_prs_poly(ids = ids)
light_euro_pheno_prs_poly <- light_euro_pheno %>%
left_join(prs, by = c("Geno.SampleID" = "IID")) %>%
filter(!is.na(PRS))
save(light_euro_pheno_prs_poly, file = here("Output/light_euro_pheno_prs_poly.RData"))
rm(ids, prs, light_euro_pheno, make_prs_poly)
# UK Biobank
load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/self_report_med.RData")
tmp <- self_report_med %>%
mutate(IID = eid,
ULT = case_when(allopurinol == TRUE | sulphinpyrazone == TRUE | probenecid == TRUE ~ TRUE, TRUE ~ FALSE)) %>%
select(IID, ULT)
load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/final_data_prs.RData")
ukbb_euro_pheno <- final_data_prs %>%
select(eid, gout, age:urate) %>%
rename(IID = eid,
GOUT = gout,
AGECOL = age,
SEX = sex,
SURICACID = urate) %>%
mutate(COHORT = "UK Biobank") %>%
left_join(tmp) %>%
select(IID, COHORT, SEX, AGECOL, GOUT, SURICACID, ULT)
ids <- ukbb_euro_pheno$IID
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/merged_PRS.map"), delim = "\t", col_names = FALSE)
map <- map %>%
separate(X2, into = c("X2", "remove"), sep = ",") %>%
select(-remove)
x <- read_delim(here("Output/Temp/merged_PRS.ped"), delim = " ", col_names = FALSE, col_types = cols(.default = col_character()))
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = (ncol(x)), by = 2)] <- str_c(map$X2, "_2")
colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
tmp <- c("FID", "IID", "PID", "MID", "SEX", "AFF", paste0(Poly_Gene_OR$RSID, "_1"), paste0(Poly_Gene_OR$RSID, "_2"))
x <- x[, colnames(x) %in% tmp]
x <- x %>% mutate(IID = as.numeric(IID)) %>% filter(IID %in% ids)
# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)
for (i in 1:nrow(Poly_Gene_OR)) {
x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
str_replace("0", "NA") %>%
str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
str_replace("0", "NA") %>%
str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>%
str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
as.numeric()
x <- x %>%
mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
colnames(x) <- c(colnames(x[1:((num_cols - 1) + i)]), Poly_Gene_OR[[i, "RSID"]])
}
x <- x %>%
select(2, (num_cols + 1):ncol(x))
# Save this dataframe for individual SNP analysis
x1 <- x
# Now to calculate the PRS
for(i in 1:nrow(Poly_Gene_OR)) {
x[i + 1] <- x[[i + 1]] * Poly_Gene_OR[[i, "OR"]]
}
x$PRS <- rowSums(x[2:(ncol(x))])
# Calculating leave-one-out PRS's
num_cols <- ncol(x)
for(i in 1:(ncol(x) - 2)) {
x[i + 1] <- x[[num_cols]] - x[[i + 1]]
}
x <- x %>%
select(IID, PRS, 2:(ncol(x) - 1))
colnames(x)[3:ncol(x)] <- str_c(paste0(colnames(x[2]), "_Less_"), colnames(x[3:ncol(x)]))
# Joining them back together
x <- x %>% left_join(x1)
ukbb_poly_pheno_prs <- ukbb_euro_pheno %>%
left_join(x, by = c("IID"))
save(ukbb_poly_pheno_prs, file = here("Output/ukbb_poly_pheno_prs.RData"))